Title analysis: titles of male and female speakers

Title analysis

# Load the saved workspace image; it presumably provides `data_t`, which is
# copied below under the name used by the rest of this analysis.
load("data_clean/data_title.Rdata")
data.tit <- data_t

Formatting for tidytext

# Keep the identifier, the grouping variables and the English title, then
# split every title into one token (word) per row.
tit <- dplyr::select(
  data.tit,
  id, gender, position_cat, audience_n, title_english
)
text_tok <- unnest_tokens(tit, output = word, input = title_english)
  • Excluding stopwords, e.g. “and” “or” “the” “of” “in”.

  • Standardizing plurals.

# English stop-word list
stop_w <- tibble(word = stopwords(source = "stopwords-iso"))

# Numbers, dashes and other noise tokens to discard
remover <- c("ăƒŒ", "1", "1st", "2", "364", "40", "70", "750", "aff", "da")

# Drop the stop words from the corpus, then the extra noise tokens
text <- text_tok %>%
  anti_join(stop_w, by = "word") %>%
  filter(!word %in% remover)

# Resolve simple plurals by trimming the plural suffix.
# `plural` lists words whose singular is obtained by dropping the final "s".
plural <- c(
  "actions", "advances", "adaptations", "amphibians", "animals", "ants",
  "anurans", "applications", "bees", "builds", "birds",
  "cerrados", "challenges",
  "continents", "crops",
  "decisions", "declines", "determines", "determinants", "defenses",
  "dynamics",
  "economics", "ecosystems", "environments", "experiences",
  "forests",
  "genetics", "gifts", "gradients", "guides", "impacts",
  "increases", "interactions", "lives",
  "landscapes", "males", "mammals", "mangroves", "models", "movements",
  "mutualisms", "networks", "neotropics",
  "opilions", "phenotypes", "plants", "projects", "paths", "perspectives",
  "populations", "promotes", "relationships", "relations",
  "resources", "responses", "roads", "services", "skulls", "snakes", "seeds",
  "spaces", "spiders", "stages", "trees", "variations",
  "threats"
)

# Words whose singular needs the final "es" removed. Dropping only the "s"
# would turn "approaches" into the non-word "approache", which would then be
# counted separately from "approach".
plural_es <- c("approaches")

text <- text %>%
  mutate(word = case_when(
    word %in% plural_es ~ substr(word, 1, nchar(word) - 2),
    word %in% plural ~ substr(word, 1, nchar(word) - 1),
    TRUE ~ word
  ))
  • Grouping similar words:
# Lemma table: each row maps a word variant (first element) to the canonical
# form (second element) under which it is counted.
# NOTE(review): some variants look misspelled ("intregating", "occuring",
# "studing", "heterogeneties"); presumably they match misspellings present in
# the raw titles -- confirm before "correcting" them.
lemma <- rbind(
  c("adaptive", "adaptation"),
  c("advancement", "advance"),
  c("agricultural", "agriculture"),
  c("agro", "agriculture"),
  c("amazonia", "amazon"),
  c("amazonian", "amazon"),
  c("andean", "andes"),
  c("apply", "application"),
  c("applying", "application"),
  c("apidae", "apis"),
  c("arachnida", "arachnid"),
  c("argue", "argument"),
  c("basal", "basis"),
  c("behavioral", "behavior"),
  c("behavioural", "behavior"),
  c("bignonieae", "bignoniaceae"),
  c("biological", "biology"),
  c("brazilian", "brazil"),
  c("building", "build"),
  c("changing", "change"),
  c("cnidarian", "cnidaria"),
  c("coastal", "coast"),
  c("colour", "color"),
  c("colors", "color"),
  c("communities", "community"),
  c("competitive", "competition"),
  c("complexity", "complex"),
  c("convergences", "convergence"),
  c("convergent", "convergence"),
  # The target used to read "cordata.tit": an artefact of a search/replace of
  # "data" by "data.tit" that also hit the inside of this word.
  c("cordatus", "cordata"),
  c("croplands", "crop"),
  c("cultural", "culture"),
  c("darwin's", "darwin"),
  c("darwinian", "darwin"),
  c("defensive", "defense"),
  c("dependent", "dependence"),
  c("detecting", "detection"),
  c("determine", "determinant"),
  c("developmental", "development"),
  c("dispersers", "dispersal"),
  c("disturbed", "disturbance"),
  c("diversification", "diversity"),
  c("dragonflies", "dragonfly"),
  c("drier", "drought"),
  c("ecological", "ecology"),
  c("ecologists", "ecology"),
  c("endemic", "endemism"),
  c("effectiveness", "efficiency"),
  c("environmental", "environment"),
  c("evolutionary", "evolution"),
  c("expanding", "expansion"),
  c("extinct", "extinction"),
  c("facilitate", "facilitation"),
  c("fisheries", "fishery"),
  c("floral", "flora"),
  c("floristic", "flora"),
  c("forested", "forest"),
  c("functional", "function"),
  c("functionally", "function"),
  c("functioning", "function"),
  c("geographical", "geographic"),
  c("heterogeneties", "heterogeneity"),
  c("heterogeneous", "heterogeneity"),
  c("histories", "history"),
  c("integrated", "integration"),
  c("intregating", "integration"),
  c("integrative", "integration"),
  c("invasive", "invasion"),
  c("isotopic", "isotope"),
  c("linking", "link"),
  c("living", "live"),
  c("mammalia", "mammal"),
  c("managed", "manage"),
  c("managers", "manage"),
  c("mathematical", "mathematics"),
  c("mates", "mating"),
  c("mediated", "mediate"),
  c("mechanistic", "mechanism"),
  c("matrices", "matrix"),
  c("migratory", "migration"),
  c("mimicking", "mimicry"),
  c("modeling", "model"),
  c("mutualistic", "mutualism"),
  c("natural", "nature"),
  c("neotropical", "neotropic"),
  c("northeastern", "northeast"),
  c("occuring", "occur"),
  c("onça", "onca"),
  c("opiliones", "opilion"),
  c("parasite", "parasitism"),
  c("parent", "parenting"),
  c("phylogenies", "phylogeny"),
  c("phylogenetic", "phylogeny"),
  c("phylogenomic", "phylogeny"),
  c("pollinators", "pollination"),
  c("protected", "protect"),
  c("protective", "protect"),
  c("rainfall", "rain"),
  c("reconstructing", "reconstruction"),
  c("regulatory", "regulation"),
  c("regulates", "regulation"),
  c("relation", "relationship"),
  c("reproductive", "reproduction"),
  c("restored", "restoration"),
  c("robustness", "robust"),
  c("scientific", "science"),
  c("scientist", "science"),
  c("sexy", "sexual"),
  c("simulated", "simulation"),
  c("societies", "society"),
  c("social", "society"),
  c("socio", "society"),
  c("space", "spatial"),
  c("spacio", "spatial"),
  c("stabilize", "stability"),
  c("stable", "stability"),
  c("stories", "story"),
  c("strategic", "strategy"),
  c("strategies", "strategy"),
  c("structured", "structure"),
  c("structuring", "structure"),
  c("studies", "study"),
  c("studing", "study"),
  c("sustainable", "sustainability"),
  c("theories", "theory"),
  c("theoretical", "theory"),
  c("threatened", "threat"),
  c("tropical", "tropic"),
  c("vision", "visual")
)
# Keep the columns as character vectors on every R version; with factor
# columns the assignment below would write factor codes instead of words.
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)

# Rows are applied one after another, so a later row also sees words that an
# earlier row has already rewritten. seq_len() makes an empty table a no-op
# and %in% never yields NA in the index.
for (i in seq_len(nrow(lemma))) {
  text$word[text$word %in% lemma[i, 1]] <- lemma[i, 2]
}

Counting words Frequency by gender

Removing stopwords, we keep 2299 words.

# Number of remaining tokens per gender
table(text$gender)
## 
##    F    M 
## 1062 1237
# Number of remaining tokens per position category (rows) and gender (columns)
table(text$position_cat, text$gender)
##            
##               F   M
##   others     16  10
##   postdoc   179 230
##   professor 163 437
##   student   693 549
# Frequency of each distinct word, both genders pooled
pala <- text %>%
  count(word) 

20 palavra mais comuns

# Table of the 20 most frequent words (words tied at the cut-off are kept)
kable(top_n(count(text, word, sort = TRUE), 20, n))
word n
ecology 50
forest 42
evolution 32
landscape 25
bird 22
model 22
diversity 21
species 21
environment 19
plant 18
structure 17
atlantic 15
brazil 15
effects 15
conservation 14
interaction 13
study 13
bee 12
community 12
network 12
patterns 12
são 12

word cloud

# Word cloud of all title words
textplot_wordcloud(x = dfm(tokens(text$word)))

# Word clouds by gender on a 1 x 2 layout: female words first, then male words
par(mfrow = c(1, 2))
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "F"])),
                   col = "#6D57CF")
# NOTE(review): par(new = TRUE) asks the next plot to draw over the current
# figure rather than advance to the second panel -- confirm this is intended.
par(new = TRUE)
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "M"])),
                   col = "#FCA532")

Word frequencies by gender

# Relative frequency of every word within each gender, reshaped to one row per
# word with *_F / *_M columns, plus the absolute and relative difference
# between the two proportions. Rows are sorted by decreasing absolute
# difference and the first 20 get a text label for the plot.
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  # drop the grouping before reshaping: `gender` stops being a column
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p)) %>%
  # safe when fewer than 20 words are available, unlike `label[1:20] <- ...`
  mutate(label = if_else(row_number() <= 20, word, NA_character_))

# Female vs male proportion of each word on log axes; the dashed line marks
# equal use by both genders.
ggplot(props, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(size = 2.5, alpha = 0.5) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_y_log10(name = "Female Most used words",
                labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

ggsave("figures/title_wordFreq.jpg", height = 5, width = 7)

Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females. Top 20 words by absolute differences are also indicated in text.

Correlation of word-frequency use between genders:

# Correlation between the female and male proportions of each word
# (default cor.test method; the output below reports Pearson's correlation).
cor.test(props$proportion_F, props$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  props$proportion_F and props$proportion_M
## t = 14.789, df = 225, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6294218 0.7625874
## sample estimates:
##       cor 
## 0.7020916

Highly correlated -> it means they tend to use the same frequency of main words

# Diverging bar chart for the labelled words: female proportions are negated
# so their bars extend to the left of zero, male bars to the right. Words are
# ordered by their total count.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(
    word = fct_reorder(word, ntot, max),
    proportion_F = proportion_F * -1
  ) %>%
  pivot_longer(
    c(proportion_F, proportion_M),
    names_to = "gender",
    values_to = "proportion"
  )

ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  xlab("Proportion") +
  ylab("") +
  scale_fill_manual(
    name = "gender",
    values = c("#6D57CF", "#FCA532"),
    labels = c("F", "M")
  ) +
  geom_vline(
    xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
    linetype = "dotted",
    col = "darkgray"
  ) +
  # tick labels show magnitudes because the sign only encodes the gender
  scale_x_continuous(
    breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
    labels = c(0.02, 0.01, 0, 0.01, 0.02)
  )

ggsave("figures/title_wordFreq_barplot.jpeg", units = "in",
       width = 7, height = 7, dpi = 300)

PROFESSOR Word frequencies by gender

# Tokens from professors' titles only. filter() drops rows whose position is
# NA, whereas `text[text$position_cat == "professor", ]` would return all-NA
# rows for them.
textprof <- text %>% filter(position_cat == "professor")

# Word clouds by gender for professors: female words first, then male words
par(mfrow = c(1, 2))
textplot_wordcloud(x = dfm(tokens(textprof$word[textprof$gender == "F"])),
                   col = "#6D57CF")
# NOTE(review): par(new = TRUE) asks the next plot to draw over the current
# figure rather than advance to the second panel -- confirm this is intended.
par(new = TRUE)
textplot_wordcloud(x = dfm(tokens(textprof$word[textprof$gender == "M"])),
                   col = "#FCA532")

# Same word-proportion table as for all speakers, restricted to professors:
# one row per word with *_F / *_M columns, sorted by decreasing absolute
# difference, the first 20 rows carrying a text label for the plot.
propsP <- text %>%
  filter(position_cat == "professor") %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  # drop the grouping before reshaping: `gender` stops being a column
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p)) %>%
  # safe when fewer than 20 words are available, unlike `label[1:20] <- ...`
  mutate(label = if_else(row_number() <= 20, word, NA_character_))

# Female vs male proportion of each word on log axes; the dashed line marks
# equal use by both genders.
ggplot(propsP, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(size = 2.5, alpha = 0.3) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female Most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

# Saved under the "title_" prefix like the other figures of this analysis; the
# former name "figures/abstract_wordFreq_Prof.jpg" looked like a copy/paste
# leftover. NOTE(review): confirm nothing else reads the old file name.
ggsave("figures/title_wordFreq_Prof.jpg", height = 5, width = 7)

Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females. Top 20 words by absolute differences are also indicated in text.

Correlation of word-frequency use between genders:

# Correlation between the female and male proportions of each word among
# professors (default cor.test method; the output below reports Pearson's).
cor.test(propsP$proportion_F, propsP$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  propsP$proportion_F and propsP$proportion_M
## t = 0.98729, df = 40, p-value = 0.3294
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1570569  0.4376523
## sample estimates:
##       cor 
## 0.1542369

No correlation

20 palavras mais usadas

# Words behind the 20 largest per-gender proportions among professors.
# The words themselves are pulled: `label` is only filled for the 20 largest
# differences, so pulling it returned NAs and the wrong set of words.
pala <- propsP %>%
  pivot_longer(cols = c(proportion_F, proportion_M),
               names_to = "gender", values_to = "prop") %>%
  arrange(desc(prop)) %>%
  slice_head(n = 20) %>%
  pull(word) %>%
  unique()

# Diverging bar chart data: female proportions are negated so their bars
# extend to the left of zero. A frequent word may be missing for one gender,
# so missing counts are treated as 0 when ordering words by total count.
propsP3 <- propsP %>%
  filter(word %in% pala) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = coalesce(n_F, 0L) + coalesce(n_M, 0L)) %>%
  mutate(word = fct_reorder(word, ntot, max),
         proportion_F = proportion_F * -1) %>%
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

axis_breaks <- c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03)

ggplot(propsP3, aes(x = proportion, y = word, fill = gender)) +
  geom_col() + ylab("") + xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = axis_breaks,
             linetype = "dotted",
             col = "darkgray") +
  # every tick shows its magnitude; the -0.01 tick used to be labelled "-0.01"
  scale_x_continuous(breaks = axis_breaks,
                     labels = abs(axis_breaks))

20 words with the largest differences in frequency

# Diverging bar chart for the professors' words with the largest differences
# (the labelled rows of propsP): female proportions are negated so their bars
# extend to the left of zero, male bars to the right.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, ntot, max),
         proportion_F = proportion_F * -1) %>%
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

axis_breaks <- c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03)

ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() + ylab("") + xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = axis_breaks,
             linetype = "dotted",
             col = "darkgray") +
  # every tick shows its magnitude; the -0.01 tick used to be labelled "-0.01"
  scale_x_continuous(breaks = axis_breaks,
                     labels = abs(axis_breaks))

ggsave("figures/title_wordFreq_barplot_Prof.jpeg", units = "in",
       width = 7, height = 7, dpi = 300)

TF IDF

OBS: essas análises não ajudaram muito, talvez nem precisem mais ficar aqui

# tf-idf of every word, with each gender treated as one document
text_id <- text %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
text_id$word <- as.factor(text_id$word)

# Highest tf-idf words of each gender, one panel per gender
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is mapped to x; the y axis lists the words and needs no title
  # (the two titles used to be swapped)
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

TF IDF professors

OBS: essas análises não ajudaram muito, talvez nem precisem mais ficar aqui

# tf-idf of every word in professors' titles, each gender being one document
text_idP <- text %>%
  filter(position_cat == "professor") %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
text_idP$word <- as.factor(text_idP$word)

# Highest tf-idf words of each gender, one panel per gender
text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is mapped to x; the y axis lists the words and needs no title
  # (the two titles used to be swapped)
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

Topic model

# Document-term matrix with one document per "<id>_<gender>" key; the gender
# suffix is read back from the document name later on.
matext <- text %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  # namespaced like the select() call at the top of the file, so another
  # attached package's select() cannot mask it
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

# LDA fits with 2, 3 and 4 topics (same seed), compared by AIC
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, base = TRUE)
##         AIC     dAIC    df  
## ap_lda2 33766.0     0.0 2099
## ap_lda3 33853.1    87.1 3148
## ap_lda4 35215.2  1449.2 4197

word-topic probabilities

# Word-topic probabilities (beta) of the two-topic model
ap_topics <- tidy(ap_lda2, matrix = "beta")

# Highest-beta terms of each topic, ordered by topic and decreasing beta
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Horizontal bar chart of those terms, one panel per topic
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Document-topic probabilities

# Document-topic probabilities (gamma) of the two-topic model
ap_documents <- tidy(ap_lda2, matrix = "gamma")

# Gender is the last character of the document key; each document is then
# assigned to its highest-gamma topic.
classifi <- ap_documents %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

# Documents per gender (rows) and assigned topic (columns)
table(classifi$gender, classifi$topic)
##    
##      1  2
##   F 71 62
##   M 83 94
# Same table as row percentages
classifi %>%
  tabyl(gender, topic) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 0)
##  gender   1   2
##       F 53% 47%
##       M 47% 53%
# Distribution of the winning gamma per topic, one panel per gender
ggplot(classifi, aes(as.character(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ gender)

Topic model Professors only

# Document-term matrix for professors only, one document per "<id>_<gender>"
# key. This overwrites the objects of the all-speakers topic model.
matext <- text %>%
  filter(position_cat == "professor") %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  # namespaced like the select() call at the top of the file, so another
  # attached package's select() cannot mask it
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

# LDA fits with 2, 3 and 4 topics (same seed), compared by AIC
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, base = TRUE)
##         AIC    dAIC   df  
## ap_lda2 8479.2    0.0 793 
## ap_lda4 9086.4  607.2 1585
## ap_lda3 9271.1  791.9 1189

word-topic probabilities

# Word-topic probabilities (beta) of the two-topic model
ap_topics <- tidy(ap_lda2, matrix = "beta")

# Highest-beta terms of each topic, ordered by topic and decreasing beta
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Horizontal bar chart of those terms, one panel per topic
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Document-topic probabilities

# Document-topic probabilities (gamma) of the two-topic model
ap_documents <- tidy(ap_lda2, matrix = "gamma")

# Gender is the last character of the document key; each document is then
# assigned to its highest-gamma topic.
classifi <- ap_documents %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

# Documents per gender (rows) and assigned topic (columns)
table(classifi$gender, classifi$topic)
##    
##      1  2
##   F 13 11
##   M 38 34
# Same table as row percentages
classifi %>%
  tabyl(gender, topic) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 0)
##  gender   1   2
##       F 54% 46%
##       M 53% 47%
# Distribution of the winning gamma per topic, one panel per gender. The
# violin is drawn first with a narrow box plot on top, as in the sentiment
# plots; drawn after the box plot, the violin covered it.
classifi %>%
  ggplot(aes(as.character(topic), gamma)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  facet_wrap(~ gender)

ABSTRACT - sentiment analysis

library(tidytext)

Chapter 2, Silge & Robinson. 2018

  • The NRC lexicon categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
# Preview the NRC lexicon (one row per word/sentiment pair)
get_sentiments("nrc")
## # A tibble: 13,875 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,865 more rows
  • The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
# Preview the Bing lexicon (word and its positive/negative label)
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows
  • The AFINN lexicon assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
# Preview the AFINN lexicon (word and its numeric score)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
  • Another one in the package
# Preview the Loughran lexicon (word and its sentiment label)
get_sentiments("loughran")
## # A tibble: 4,150 × 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # ℹ 4,140 more rows

Score words difference in female and male abstracts

affword <- get_sentiments("afinn")

# Word counts per id and gender, restricted to words that have an AFINN score
affc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(affword, by = "word")

# Plain and count-weighted mean score per id and gender, computed once and
# reused by both plots; .groups = "drop" returns an ungrouped table.
aff_scores <- affc %>%
  group_by(id, gender) %>%
  summarise(mean.score = mean(value),
            weig.score = weighted.mean(value, n),
            .groups = "drop")

# NOTE(review): the plot titles say "abstract" although this file analyses
# titles -- confirm the wording.
ggplot(aff_scores, aes(x = gender, y = mean.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggtitle("Mean words score per abstract and gender")

  #ggbeeswarm::geom_beeswarm(size=3, shape=21)
ggplot(aff_scores, aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggtitle("Weighted mean words score per abstract and gender")

  #ggbeeswarm::geom_beeswarm(size=3, shape=21)

Frequency of sentiment words per abstract

As classificações das palavras não me parecem muito acuradas com a linguagem científica

nrcword <- get_sentiments("nrc")

# Number of words per id, gender and NRC sentiment category
nrc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# Distribution of those counts by gender, one panel per sentiment
ggplot(nrc, aes(x = gender, y = n)) +
  geom_violin() +
  facet_wrap(~sentiment)

Frequency of sentiment words per abstract

As classificações das palavras não me parecem muito acuradas com a linguagem científica

bingword <- get_sentiments("bing")

# Number of words per id, gender and Bing sentiment label
bing <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# Distribution of those counts by sentiment, one panel per gender
ggplot(bing, aes(x = sentiment, y = n)) +
  geom_violin() +
  facet_wrap(~gender)

Frequency of sentiment words per abstract

As classificações das palavras não me parecem muito acuradas com a linguagem científica

louword <- get_sentiments("loughran")

# Number of words per id, gender and Loughran sentiment label
lou <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(louword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# Distribution of those counts by sentiment, one panel per gender.
# Plots `lou`; it used to plot `bing` again, repeating the previous figure.
ggplot(lou, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin()